The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(dtplyr)library(data.table)
Warning: package 'data.table' was built under R version 4.4.1
Attaching package: 'data.table'
The following objects are masked from 'package:dplyr':
between, first, last
# Met data ----
# Fetch the archive only when no local copy exists, so re-runs skip the
# network round trip.
if (!file.exists("met_all.gz")) {
  # download.file() has no `timeout` argument (extra named arguments are
  # ignored); the transfer limit is taken from the `timeout` option, so set
  # it there and restore the previous value afterwards.
  old_opts <- options(timeout = max(60, getOption("timeout")))
  tryCatch(
    download.file(
      url = "https://raw.githubusercontent.com/USCbiostats/data-science-data/master/02_met/met_all.gz",
      destfile = "met_all.gz",
      method = "libcurl"
    ),
    finally = options(old_opts)
  )
}
met <- data.table::fread("met_all.gz")

# Station data ----
# Download the data
stations <- fread("https://noaa-isd-pds.s3.amazonaws.com/isd-history.csv")

# USAF values that cannot be parsed as integers become NA here; as.integer()
# reports this with an "NAs introduced by coercion" warning.
stations[, USAF := as.integer(USAF)]
Warning in eval(jsub, SDenv, parent.frame()): NAs introduced by coercion
# stations <- as.data.frame(stations)

# Recode placeholder values as missing: 999999 in USAF and empty strings in
# CTRY / STATE.
stations[, `:=`(
  USAF = fifelse(USAF == 999999, NA_integer_, USAF),
  CTRY = fifelse(CTRY == "", NA_character_, CTRY),
  STATE = fifelse(STATE == "", NA_character_, STATE)
)]

# Keep only the three relevant columns and collapse identical records.
stations <- unique(stations[, .(USAF, CTRY, STATE)])

# Drop records without a USAF id.
stations <- stations[!is.na(USAF)]

# Remove duplicates: keep the first record encountered for each USAF id.
stations <- unique(stations, by = "USAF")

# stations <- as.data.frame(stations)

# Merge: attach station metadata to every met observation. All rows of `met`
# are kept (all.x = TRUE); stations with no observations are not added
# (all.y = FALSE).
dat <- merge(
  x = met,
  y = stations,
  by.x = "USAFID",
  by.y = "USAF",
  all.x = TRUE,
  all.y = FALSE
)

head(dat[, c("USAFID", "WBAN", "STATE")], n = 4)
Key: <USAFID>
USAFID WBAN STATE
<int> <int> <char>
1: 690150 93121 CA
2: 690150 93121 CA
3: 690150 93121 CA
4: 690150 93121 CA
# Convert the combined dataset from a data.table to a plain data.frame.
dat <- as.data.frame(dat)
Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
always returns an ungrouped data frame and adjust accordingly.
`summarise()` has grouped output by 'STATE'. You can override using the
`.groups` argument.
# Print median_stations. The output below shows a tibble grouped by STATE with
# one station id per column: rep_temp, rep_wind and rep_press.
median_stations
# A tibble: 46 × 4
# Groups: STATE [46]
STATE rep_temp rep_wind rep_press
<chr> <int> <int> <int>
1 AL 720265 720265 720361
2 AR 720175 720172 720175
3 AZ 720339 720339 722720
4 CA 690150 690150 690150
5 CO 720528 720385 722817
6 CT 725027 720545 725027
7 DE 724088 724088 724093
8 FL 720373 720373 720383
9 GA 720257 720257 722070
10 IA 720293 720293 725420
# ℹ 36 more rows
# Reference point for the distance calculation, in (temp, wind, pressure)
# order.
# NOTE(review): assumes median_temp, median_wind and median_press are each a
# single numeric value defined earlier in the file -- confirm.
median_list <- c(median_temp, median_wind, median_press)

#' Euclidean distance from a reference point
#'
#' @param temp Numeric vector of temperatures.
#' @param wind Numeric vector of wind speeds.
#' @param pressure Numeric vector of atmospheric pressures.
#' @param center Numeric vector of length 3 giving the reference
#'   (temp, wind, pressure) values. Defaults to the global `median_list`.
#' @return Numeric vector of distances, one per input element; `NA` wherever
#'   any of the three inputs is `NA`.
euclid_dist <- function(temp, wind, pressure, center = median_list) {
  sqrt(
    (temp - center[1])^2 +
      (wind - center[2])^2 +
      (pressure - center[3])^2
  )
}

# Calculate the distance for each observation. euclid_dist() is vectorised, so
# it runs once over whole columns rather than once per row via rowwise().
# as_tibble() keeps dat1 a tibble, as the rowwise() |> ungroup() chain did.
dat1 <- dat |>
  as_tibble() |>
  mutate(distance = euclid_dist(temp, wind.sp, atm.press))

# Find the representative station per state: the observation closest to the
# reference point, with ties broken by the lowest latitude.
# Rows with a missing distance are dropped before grouping, so a state with no
# usable observation simply has no row in the result, and min() is never
# called on an all-NA group ("no non-missing arguments to min").
# The result stays grouped by STATE.
representative_per_state <- dat1 |>
  filter(!is.na(distance)) |>
  group_by(STATE) |>
  filter(distance == min(distance)) |>
  arrange(lat) |>
  slice(1)
Warning: There were 2 warnings in `filter()`.
The first warning was:
ℹ In argument: `distance == min(distance, na.rm = TRUE)`.
ℹ In group 26: `STATE = "ND"`.
Caused by warning in `min()`:
! no non-missing arguments to min; returning Inf
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.